from clustergrammer2 import net
df = {}
import clustergrammer_groupby as cby
import gene_exp_10x
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
def calc_mean_var_disp(df_inst):
mean_arr = []
var_arr = []
mean_names = []
for inst_gene in df_inst.index.tolist():
mean_arr.append( df_inst.loc[inst_gene].mean() )
var_arr.append(df_inst.loc[inst_gene].var())
mean_names.append(inst_gene)
ser_mean = pd.Series(data=mean_arr, index=mean_names)
ser_var = pd.Series(data=var_arr, index=mean_names)
return ser_mean, ser_var
def cell_umi_count(df):
sum_arr = []
sum_names = []
for inst_cell in df:
sum_arr.append( df[inst_cell].sum() )
sum_names.append(inst_cell)
ser_sum = pd.Series(data=sum_arr, index=sum_names)
return ser_sum
df = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df.shape
all_genes = df.index.tolist()
print(len(all_genes))
keep_genes = [x for x in all_genes if 'RPL' not in x]
keep_genes = [x for x in keep_genes if 'RPS' not in x]
print(len(keep_genes))
df = df.loc[keep_genes]
df.shape
# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']
all_genes = df.index.tolist()
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or
x.split('_')[0] in list_mito_genes]
print(mito_genes)
keep_genes = [x for x in all_genes if x not in mito_genes]
df = df.loc[keep_genes]
ser_sum = cell_umi_count(df)
df = df.div(ser_sum)
print(df.shape)
print(df.sum().head())
ser_mean, ser_var = calc_mean_var_disp(df)
num_keep_umi = 10000
num_top_var = 250
# filter for top expressing genes
keep_mean = ser_mean.sort_values(ascending=False)[:num_keep_umi].index.tolist()
df = df.loc[keep_mean]
ser_keep_var = ser_var[keep_mean]
# filter for top variance based
keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]
# ArcSinh transform
df = np.arcsinh(df/5)
# Z-score genes
net.load_df(df)
net.normalize(axis='row', norm_type='zscore')
# round to two decimal points
df = net.export_df().round(2)
print(df.shape)
net.load_df(df.loc[keep_var])
net.clip(lower=-5, upper=5)
net.widget()
net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')
net.normalize(axis='row', norm_type='zscore')
df_sig = net.export_df()
print(df_sig.shape)
rows = df_sig.index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df_sig.index = new_rows
ct_color = {}
ct_color['T cells CD8'] = 'red'
ct_color['T cells CD4 naive'] = 'blue'
ct_color['T cells CD4 memory activated'] = 'blue'
ct_color['T cells CD4 memory resting'] = '#87cefa' # sky blue
ct_color['B cells naive'] = 'purple'
ct_color['B cells memory'] = '#DA70D6' # orchid
ct_color['NK cells activated'] = 'yellow'
ct_color['NK cells resting'] = '#FCD116' # sign yellow
ct_color['Monocytes'] = '#98ff98' # mint green
ct_color['Macrophages M0'] = '#D3D3D3' # light grey
ct_color['Macrophages M1'] = '#C0C0C0' # silver
ct_color['Macrophages M2'] = '#A9A9A9' # dark grey
ct_color['N.A.'] = 'white'
def set_cat_colors(axis, cat_index, cat_title=False):
for inst_ct in ct_color:
if cat_title != False:
cat_name = cat_title + ': ' + inst_ct
else:
cat_name = inst_ct
inst_color = ct_color[inst_ct]
net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color)
set_cat_colors('col', 1)
gene_sig = df_sig.idxmax(axis=1)
gs_dict = {}
for inst_gene in gene_sig.index.tolist():
gs_dict[inst_gene] = gene_sig[inst_gene][0]
df_sig_cat = deepcopy(df_sig)
rows = df_sig_cat.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df_sig_cat.index = new_rows
net.load_df(df_sig_cat)
set_cat_colors('row', 1, 'Cell Type')
net.load_df(df_sig_cat)
net.clip(lower=-5, upper=5)
net.widget()
df_pred_cat, df_sig_sim, y_info = cby.predict_cats_from_sigs(df, df_sig,
predict_level='Cell Type', unknown_thresh=0.05)
df.columns = df_pred_cat.columns.tolist()
print(df_pred_cat.shape)
df_sig_sim = df_sig_sim.round(2)
net.load_df(df_sig_sim)
set_cat_colors('col', 1, cat_title='Cell Type')
set_cat_colors('row', 1)
df_sig_sim.columns = df_pred_cat.columns.tolist()
net.load_df(df_sig_sim)
net.widget()
rows = df_pred_cat.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df_pred_cat.index = new_rows
net.load_df(df_pred_cat)
net.clip(lower=-5, upper=5)
net.widget()
df = df.loc[keep_var]
rows = df.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df.index = new_rows
net.load_df(df)
net.clip(lower=-5, upper=5)
net.widget()